/*
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;

namespace matrix_multiplacation
{
/// <summary>
/// Micro-benchmark that evaluates <c>AT * ((G * g * GT) ⊙ (BT * d * B)) * A</c> on 4x4 tiles,
/// i.e. the Winograd F(2x2, 3x3) minimal-filtering form of a 3x3 convolution.
/// Every matrix element is a <see cref="Vector256{T}"/> of <c>float</c>, so each call
/// processes eight independent lanes at once.
/// </summary>
public class Ink_Programme
{
    // Tile geometry of the F(2x2, 3x3) transform.
    private const int FilterSize = 3;
    private const int TileSize = 4;
    private const int OutputSize = 2;

    // Filter transform matrix G (4x3), each coefficient broadcast to all lanes.
    private static readonly Vector256<float>[,] G = new Vector256<float>[4, 3]
    {
        { Vector256.Create(1f), Vector256.Create(0f), Vector256.Create(0f) },
        { Vector256.Create(0.5f), Vector256.Create(0.5f), Vector256.Create(0.5f) },
        { Vector256.Create(0.5f), Vector256.Create(-0.5f), Vector256.Create(0.5f) },
        { Vector256.Create(0f), Vector256.Create(0f), Vector256.Create(1f) }
    };

    // Transpose of G (3x4).
    private static readonly Vector256<float>[,] GT = new Vector256<float>[3, 4]
    {
        { Vector256.Create(1f), Vector256.Create(0.5f), Vector256.Create(0.5f), Vector256.Create(0f) },
        { Vector256.Create(0f), Vector256.Create(0.5f), Vector256.Create(-0.5f), Vector256.Create(0f) },
        { Vector256.Create(0f), Vector256.Create(0.5f), Vector256.Create(0.5f), Vector256.Create(1f) }
    };

    // Input transform matrix BT (4x4).
    private static readonly Vector256<float>[,] BT = new Vector256<float>[4, 4]
    {
        { Vector256.Create(1f), Vector256.Create(0f), Vector256.Create(-1f), Vector256.Create(0f) },
        { Vector256.Create(0f), Vector256.Create(1f), Vector256.Create(1f), Vector256.Create(0f) },
        { Vector256.Create(0f), Vector256.Create(-1f), Vector256.Create(1f), Vector256.Create(0f) },
        { Vector256.Create(0f), Vector256.Create(1f), Vector256.Create(0f), Vector256.Create(-1f) }
    };

    // Transpose of BT (4x4).
    private static readonly Vector256<float>[,] B = new Vector256<float>[4, 4]
    {
        { Vector256.Create(1f), Vector256.Create(0f), Vector256.Create(0f), Vector256.Create(0f) },
        { Vector256.Create(0f), Vector256.Create(1f), Vector256.Create(-1f), Vector256.Create(1f) },
        { Vector256.Create(-1f), Vector256.Create(1f), Vector256.Create(1f), Vector256.Create(0f) },
        { Vector256.Create(0f), Vector256.Create(0f), Vector256.Create(0f), Vector256.Create(-1f) }
    };

    // Output transform matrix AT (2x4).
    private static readonly Vector256<float>[,] AT = new Vector256<float>[2, 4]
    {
        { Vector256.Create(1f), Vector256.Create(1f), Vector256.Create(1f), Vector256.Create(0f) },
        { Vector256.Create(0f), Vector256.Create(1f), Vector256.Create(-1f), Vector256.Create(-1f) }
    };

    // Transpose of AT (4x2).
    private static readonly Vector256<float>[,] A = new Vector256<float>[4, 2]
    {
        { Vector256.Create(1f), Vector256.Create(0f) },
        { Vector256.Create(1f), Vector256.Create(1f) },
        { Vector256.Create(1f), Vector256.Create(-1f) },
        { Vector256.Create(0f), Vector256.Create(1f) }
    };

    /// <summary>
    /// Allocates a jagged <paramref name="rows"/> x <paramref name="cols"/> matrix with every row
    /// array created. <c>new T[rows][]</c> alone leaves each row <c>null</c>.
    /// </summary>
    private static Vector256<float>[][] AllocateTile(int rows, int cols)
    {
        var tile = new Vector256<float>[rows][];
        for (int r = 0; r < rows; r++)
        {
            tile[r] = new Vector256<float>[cols];
        }
        return tile;
    }

    /// <summary>
    /// Transforms a 3x3 filter into the 4x4 transform domain: <c>G * g * GT</c>.
    /// </summary>
    /// <param name="g">3x3 filter; each element carries eight lanes.</param>
    /// <returns>A new 4x4 rectangular matrix.</returns>
    private static Vector256<float>[,] doGgGT(Vector256<float>[,] g)
    {
        // tmp = G (4x3) * g (3x3)
        var tmp = new Vector256<float>[TileSize, FilterSize];
        for (int m = 0; m < TileSize; m++)
        {
            for (int n = 0; n < FilterSize; n++)
            {
                Vector256<float> val = Vector256.Create(0.0f);
                for (int k = 0; k < FilterSize; k++)
                {
                    val = Avx.Add(val, Avx.Multiply(G[m, k], g[k, n]));
                }
                tmp[m, n] = val;
            }
        }

        // ret = tmp (4x3) * GT (3x4)
        var ret = new Vector256<float>[TileSize, TileSize];
        for (int m = 0; m < TileSize; m++)
        {
            for (int n = 0; n < TileSize; n++)
            {
                Vector256<float> val = Vector256.Create(0.0f);
                for (int k = 0; k < FilterSize; k++)
                {
                    val = Avx.Add(val, Avx.Multiply(tmp[m, k], GT[k, n]));
                }
                ret[m, n] = val;
            }
        }
        return ret;
    }

    /// <summary>
    /// Transforms a 4x4 input tile into the transform domain: <c>BT * d * B</c>.
    /// </summary>
    /// <param name="d">4x4 jagged input tile; it is only read.</param>
    /// <returns>A new 4x4 jagged matrix.</returns>
    private static Vector256<float>[][] doBTdB(Vector256<float>[][] d)
    {
        // tmp = BT (4x4) * d (4x4)
        Vector256<float>[][] tmp = AllocateTile(TileSize, TileSize);
        for (int m = 0; m < TileSize; m++)
        {
            for (int n = 0; n < TileSize; n++)
            {
                Vector256<float> val = Vector256.Create(0.0f);
                for (int k = 0; k < TileSize; k++)
                {
                    val = Avx.Add(val, Avx.Multiply(BT[m, k], d[k][n]));
                }
                tmp[m][n] = val;
            }
        }

        // ret = tmp (4x4) * B (4x4)
        Vector256<float>[][] ret = AllocateTile(TileSize, TileSize);
        for (int m = 0; m < TileSize; m++)
        {
            for (int n = 0; n < TileSize; n++)
            {
                Vector256<float> val = Vector256.Create(0.0f);
                for (int k = 0; k < TileSize; k++)
                {
                    val = Avx.Add(val, Avx.Multiply(tmp[m][k], B[k, n]));
                }
                ret[m][n] = val;
            }
        }
        return ret;
    }

    /// <summary>
    /// Element-wise (Hadamard) product of the transformed filter and the transformed tile.
    /// </summary>
    /// <param name="g">4x4 rectangular matrix (transformed filter).</param>
    /// <param name="d">4x4 jagged matrix (transformed tile).</param>
    /// <returns>A new 4x4 jagged matrix with <c>ret[m][n] = g[m, n] * d[m][n]</c>.</returns>
    private static Vector256<float>[][] doElementMul(Vector256<float>[,] g, Vector256<float>[][] d)
    {
        Vector256<float>[][] ret = AllocateTile(TileSize, TileSize);
        for (int m = 0; m < TileSize; m++)
        {
            for (int n = 0; n < TileSize; n++)
            {
                ret[m][n] = Avx.Multiply(g[m, n], d[m][n]);
            }
        }
        return ret;
    }

    /// <summary>
    /// Maps a 4x4 transform-domain tile back to the 2x2 output: <c>AT * a * A</c>.
    /// </summary>
    /// <param name="a">4x4 jagged matrix; it is only read.</param>
    /// <returns>A new 2x2 jagged matrix.</returns>
    private static Vector256<float>[][] doATaA(Vector256<float>[][] a)
    {
        // tmp = AT (2x4) * a (4x4)
        Vector256<float>[][] tmp = AllocateTile(OutputSize, TileSize);
        for (int m = 0; m < OutputSize; m++)
        {
            for (int n = 0; n < TileSize; n++)
            {
                Vector256<float> val = Vector256.Create(0.0f);
                for (int k = 0; k < TileSize; k++)
                {
                    val = Avx.Add(val, Avx.Multiply(AT[m, k], a[k][n]));
                }
                tmp[m][n] = val;
            }
        }

        // ret = tmp (2x4) * A (4x2)
        Vector256<float>[][] ret = AllocateTile(OutputSize, OutputSize);
        for (int m = 0; m < OutputSize; m++)
        {
            for (int n = 0; n < OutputSize; n++)
            {
                Vector256<float> val = Vector256.Create(0.0f);
                for (int k = 0; k < TileSize; k++)
                {
                    val = Avx.Add(val, Avx.Multiply(tmp[m][k], A[k, n]));
                }
                ret[m][n] = val;
            }
        }
        return ret;
    }

    /// <summary>
    /// Builds 2048 identical 4x4 tiles and an all-ones 3x3 filter, runs the full transform
    /// pipeline over every tile, and prints the elapsed wall-clock time in milliseconds.
    /// </summary>
    /// <param name="args">Unused.</param>
    static void Main(string[] args)
    {
        // Every arithmetic call below is an AVX intrinsic, so AVX (not SSE4.1) is the
        // capability that has to be present.
        if (!Avx.IsSupported)
        {
            Console.WriteLine("Your CPU doesn't support AVX Instruction set");
            return;
        }

        const int tileCount = 2048;

        // The List capacity argument only reserves storage; tiles must be added explicitly.
        var image = new List<Vector256<float>[][]>(tileCount);
        for (int i = 0; i < tileCount; i++)
        {
            Vector256<float>[][] tile = AllocateTile(TileSize, TileSize);
            for (int m = 0; m < TileSize; m++)
            {
                for (int n = 0; n < TileSize; n++)
                {
                    tile[m][n] = Avx.Multiply(Vector256.Create(m + 1.0f), Vector256.Create(1.0f));
                }
            }
            image.Add(tile);
        }

        var filter = new Vector256<float>[FilterSize, FilterSize];
        for (int m = 0; m < FilterSize; m++)
        {
            for (int n = 0; n < FilterSize; n++)
            {
                filter[m, n] = Vector256.Create(1.0f);
            }
        }

        var rets = new List<Vector256<float>[][]>(tileCount);

        // Stopwatch is monotonic and high resolution, unlike DateTime.Now.
        var timer = System.Diagnostics.Stopwatch.StartNew();

        // The filter is the same for every tile, so transform it once (still inside the timed region).
        Vector256<float>[,] transformedFilter = doGgGT(filter);
        for (int j = 0; j < image.Count; j++)
        {
            rets.Add(doATaA(doElementMul(transformedFilter, doBTdB(image[j]))));
        }

        timer.Stop();

        Console.Write("-----");
        Console.Write(timer.Elapsed.TotalMilliseconds);
        Console.Write("\n");

        // NOTE(review): this grid prints only blanks; it looks like a placeholder for dumping a
        // 2x2 result tile. The output is kept as-is - confirm what should be printed here.
        for (int m = 0; m < OutputSize; m++)
        {
            for (int n = 0; n < OutputSize; n++)
            {
                Console.Write(" ");
            }
            Console.Write("\n");
        }
    }

}
}
*/
